Commit 9dd4ab0e authored by mindspore-ci-bot, committed by Gitee

!4896 [MS][LITE][Develop]int8 conv op

Merge pull request !4896 from ling/conv1x1
......@@ -367,6 +367,26 @@ void ConvInt8Opt(int8_t *input_data, int8_t *packed_input, int8_t *packed_weight
}
}
void Conv1x1Int8(const int8_t *packed_input, const int8_t *packed_weight, int8_t *dst, const int32_t *input_sum,
const int32_t *bias, int row, int col, int deep16, ConvParameter *conv_param,
MATMUL_OPT_R_FUNC matmul_func) {
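/* if an optimized requantizing matmul was registered (e.g. loaded from the optimize module), use it;
 * otherwise fall back to the portable C reference kernel */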
if (matmul_func != NULL) {
matmul_func(packed_input, packed_weight, dst, row, col, deep16, conv_param->output_channel_, input_sum, bias,
conv_param->conv_quant_arg_.left_shift_, conv_param->conv_quant_arg_.right_shift_,
conv_param->conv_quant_arg_.quant_multiplier_, conv_param->conv_quant_arg_.output_quant_args_[0].zp_,
conv_param->conv_quant_arg_.out_act_min_[0], conv_param->conv_quant_arg_.out_act_max_[0],
(conv_param->conv_quant_arg_.filter_arg_num_ > 1));
} else {
MatMulInt8_16x4_r(packed_input, packed_weight, dst, row, col, deep16, conv_param->output_channel_, input_sum, bias,
conv_param->conv_quant_arg_.left_shift_, conv_param->conv_quant_arg_.right_shift_,
conv_param->conv_quant_arg_.quant_multiplier_,
conv_param->conv_quant_arg_.output_quant_args_[0].zp_,
conv_param->conv_quant_arg_.out_act_min_[0], conv_param->conv_quant_arg_.out_act_max_[0],
(conv_param->conv_quant_arg_.filter_arg_num_ > 1));
}
return;
}
// int8 convolution 3x3
void Conv3x3Int8(int16_t *input_data, int16_t *transed_weight, const int32_t *bias_data, int8_t *output_data,
int16_t *tile_buffer, int16_t *block_unit_buffer, int32_t *tmp_dst_buffer, int8_t *tmp_out,
......
......@@ -25,6 +25,8 @@
#include "nnacl/conv_parameter.h"
#include "nnacl/winograd_utils.h"
#include "nnacl/quantization/quantize.h"
#include "nnacl/matmul_parameter.h"
#include "nnacl/int8/matmul_int8.h"
typedef void (*GEMM_FUNC)(int8_t *dst, const int8_t *src, const int8_t *weight, const int32_t *bias, size_t ksize,
size_t ic4, size_t output_channel, size_t offset, const int32_t *input_sum, size_t act_min,
......@@ -51,6 +53,11 @@ void ConvInt8Opt(int8_t *input_data, int8_t *packed_input, int8_t *packed_weight
int32_t *tmp_dst, int8_t *tmp_out, int8_t *output_data, int32_t *input_sum, int task_id,
ConvParameter *conv_param, GEMM_FUNC gemm_func);
// int8 convolution 1x1
void Conv1x1Int8(const int8_t *packed_input, const int8_t *packed_weight, int8_t *dst, const int32_t *input_sum,
const int32_t *bias, int row, int col, int deep16, ConvParameter *conv_param,
MATMUL_OPT_R_FUNC matmul_func);
// int8 convolution 3x3
void Conv3x3Int8(int16_t *input_data, int16_t *transed_weight, const int32_t *bias_data, int8_t *output_data,
int16_t *tile_buffer, int16_t *block_unit_buffer, int32_t *tmp_dst_buffer, int8_t *tmp_out,
......
......@@ -172,73 +172,7 @@ void DeConvPackWeightSum(int8_t *weight, int32_t *weight_sum, int32_t input_zp,
void DeConvPackInputSum(const int8_t *src, int32_t *dst, int32_t filter_zp, size_t row4, size_t col16,
bool support_opt) {
/* optimize normal -> same layout */
#ifdef ENABLE_ARM64
asm volatile(
"mov x10, %[src] \n"
"mov x11, %[dst] \n"
"dup v15.4s, %w[filter_zp] \n"
"mov x0, #0 \n"
"1: \n"
"cmp x0, %[row4] \n"
"beq 4f \n"
"add x0, x0, #4\n"
"dup v10.4s, wzr \n"
"mov x2, #0 \n"
"2: \n"
"cmp x2, %[col16] \n"
"beq 3f \n"
"add x2, x2, #16\n"
"ld1 {v0.16b}, [x10], #16\n"
"ld1 {v1.16b}, [x10], #16\n"
"ld1 {v2.16b}, [x10], #16\n"
"ld1 {v3.16b}, [x10], #16\n"
"saddlp v4.8h, v0.16b \n"
"saddlp v5.8h, v1.16b \n"
"saddlp v6.8h, v2.16b \n"
"saddlp v7.8h, v3.16b \n"
"saddlp v0.4S, v4.8h \n"
"saddlp v1.4S, v5.8h \n"
"saddlp v2.4S, v6.8h \n"
"saddlp v3.4S, v7.8h \n"
"addv s4, v0.4S \n"
"addv s5, v1.4S \n"
"addv s6, v2.4S \n"
"addv s7, v3.4S \n"
"mov v0.s[0], v4.s[0] \n"
"mov v0.s[1], v5.s[0] \n"
"mov v0.s[2], v6.s[0] \n"
"mov v0.s[3], v7.s[0] \n"
"add v10.4s, v10.4s, v0.4s \n"
"b 2b\n"
"3: \n"
"mul v10.4s, v10.4s, v15.4s \n"
"st1 {v10.4s}, [x11], #16 \n"
"beq 1b \n"
"4: \n"
:
: [ dst ] "r"(dst), [ src ] "r"(src), [ row4 ] "r"(row4), [ col16 ] "r"(col16), [ filter_zp ] "r"(filter_zp)
: "x0", "x1", "x2", "x3", "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v10", "v15");
#else
for (int r = 0; r < row4; r++) {
int32_t tmp_value = 0;
for (int c = 0; c < col16; c++) {
int r4div = r / C4NUM, r4mod = r % C4NUM, c16div = c / C16NUM, c16mod = c % C16NUM;
int src_index = r4div * C4NUM * col16 + c16div * C16NUM * C4NUM + r4mod * C16NUM + c16mod;
tmp_value += src[src_index];
}
dst[r] = tmp_value * filter_zp;
}
#endif
PackInputSum16x4PerLater(src, dst, filter_zp, row4, col16);
return;
}
......
......@@ -28,6 +28,19 @@ void RowMajor2Row8MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col)
}
}
void RowMajor2Row4x16MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col) {
int col16 = UP_ROUND(col, C16NUM);
for (int r = 0; r < row; r++) {
int rd4 = r / C4NUM;
int rm4 = r % C4NUM;
for (int c = 0; c < col; c++) {
int cd16 = c / C16NUM;
int cm16 = c % C16NUM;
dst_ptr[rd4 * col16 * C4NUM + cd16 * C16NUM * C4NUM + rm4 * C16NUM + cm16] = src_ptr[r * col + c];
}
}
}
void MatrixPack4x16UnitInt8(int8_t *src, int8_t *dst, int row, int col, int stride) {
for (int r = 0; r < row; r++) {
int8_t *src_r = src + r * stride;
......@@ -145,7 +158,38 @@ void MatMulInt8_16x4(const int8_t *a, const int8_t *b, int *dst, int row_4, int
return;
}
void MatMulInt8_16x4_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_16,
size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, int32_t maxi,
bool per_channel) {
/* row4x16-major * row16x4-major => (int8)row-major : per-channel */
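/* a is tiled as [row4 blocks][deep16 blocks][4 rows][16 deeps], b as [col4 blocks][deep16 blocks][4 cols][16 deeps];
 * ai/bi below flatten element (r, d) / (c, d) into those tilings */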
for (int r = 0; r < row; r++) {
for (int c = 0; c < col; c++) {
int r4div = r / C4NUM, r4mod = r % C4NUM;
int c4div = c / C4NUM, c4mod = c % C4NUM;
size_t ci = r * stride + c;
int32_t value = 0;
for (int d = 0; d < deep_16; d++) {
int d16div = d / C16NUM, d16mod = d % C16NUM;
size_t ai = r4div * deep_16 * C4NUM + d16div * C4NUM * C16NUM + r4mod * C16NUM + d16mod;
size_t bi = c4div * deep_16 * C4NUM + d16div * C4NUM * C16NUM + c4mod * C16NUM + d16mod;
value = value + a[ai] * b[bi];
}
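/* zero-point corrections: input_sum already holds filter_zp * sum(a[r, :]) and bias folds in the
 * input_zp terms, so after these two lines value equals sum((a - input_zp) * (b - filter_zp)) + bias */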
int32_t cur_input_sum = per_channel ? input_sum[c4div * UP_ROUND(row, C4NUM) + r * C4NUM + c4mod] : input_sum[r];
value -= cur_input_sum;
value += bias[c];
int32_t cur_left_shift = per_channel ? left_shift[c] : left_shift[0];
int32_t cur_right_shift = per_channel ? right_shift[c] : right_shift[0];
int32_t cur_multiplier = per_channel ? multiplier[c] : multiplier[0];
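/* requantize: fixed-point multiply plus shifts back to the output scale, then add the output zero point */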
value = MultiplyByQuantizedMultiplier(value, cur_multiplier, cur_left_shift, cur_right_shift) + output_zp;
value = MSMIN(maxi, value);
value = MSMAX(mini, value);
dst[ci] = (int8_t)value;
}
}
return;
}
#ifdef ENABLE_ARM64
void RowMajor2Row4x16Major(int8_t *src, int row, int col, int8_t *dst, int col_16) {
int stride = sizeof(int8_t) * 16 * 4;
for (int r = 0; r < row; ++r) {
......@@ -201,4 +245,3 @@ void Row4x4Major2RowMajor(int8_t *src, int row4, int8_t *dst, int row, int cow)
}
}
}
#endif
......@@ -28,17 +28,22 @@ void MatMulInt8(const int8_t *a, const int8_t *b, int *c, const int row8, const
const int a_zp, const int b_zp);
void MatMulInt8_16x4(const int8_t *a, const int8_t *b, int *dst, int row_4, int col_4, int deep_16,
const int *input_sum, const int *bias);
void MatMulInt8_16x4_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_16,
size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, int32_t maxi,
bool per_channel);
void RowMajor2Row8MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col);
void RowMajor2Row4x16MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col);
void RowMajor2Col8MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col);
void RowMajor2Row16x4MajorInt8(void *src_ptr, void *dst_ptr, int row, int col);
#ifdef ENABLE_ARM64
void RowMajor2Row4x16Major(int8_t *src, int row, int col, int8_t *dst, int col_16);
void RowMajor2Col16x4Major(int8_t *src, int row, int col, int8_t *dst, int row_16);
void RowMajor2Asums(int8_t *a, int row, int col, int b_zp, int *dst);
void RowMajor2Bbias(int8_t *b, int row, int col, int a_zp, int b_zp, int *bias, int *dst);
void Row4x4Major2RowMajor(int8_t *src, int row4, int8_t *dst, int row, int cow);
#ifdef ENABLE_ARM64
// bias = bias + depth * a_zp * b_zp - a_zp * b_sums
void MatmulInt8Neon64(const int8_t *a, const int8_t *b, int8_t *dst, int row4, int col4, int deep16, const int *a_sums,
const int *bias, int act_min, int act_max, int out_zp, int multiplier, int left_shift,
......
......@@ -22,6 +22,11 @@
typedef void (*MATMUL_OPT_R4_FUNC)(const int8_t *a, const int8_t *b, int *dst, int row_4, int col_4, int deep_16,
const int *input_sum, const int *bias);
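/* requantizing tiled int8 matmul: consumes 4x16-packed inputs, applies per-tensor or per-channel
 * quantization parameters, and writes clamped int8 output */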
typedef void (*MATMUL_OPT_R_FUNC)(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_16,
size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini,
int32_t maxi, bool per_channel);
typedef void (*MAT_TRANS_FUNC)(void *dst, void *a, int row, int col);
typedef enum ActType { ActType_No, ActType_Relu, ActType_Relu6 } ActType;
......
......@@ -15,6 +15,7 @@
*/
#include <stdlib.h>
#include <stdbool.h>
#ifdef __cplusplus
extern "C" {
......@@ -45,4 +46,11 @@ void MatMulR4Int8_optimize_handler(const int8_t *a, const int8_t *b, int *dst, i
const int *input_sum, const int *bias) {
return MatMulOptR4Int8Neon64(a, b, dst, row4, col4, deep16, input_sum, bias);
}
void MatMulRInt8_optimize_handler(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_16,
size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini,
int32_t maxi, bool per_channel) {
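/* placeholder: the optimized assembly for this entry point is not implemented in this commit,
 * so it intentionally does nothing and callers fall back to the C kernel */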
return;
}
#endif
......@@ -153,22 +153,24 @@ void PackWeightInt8Opt(int8_t *weight_data, ConvParameter *conv_param, int8_t *p
} // kernel plane loop
}
void Conv1x1InputPackFp32(const float *src, float *dst, ConvParameter *conv_param) {
void Conv1x1InputPack(const void *src_ptr, void *dst_ptr, ConvParameter *conv_param, int data_size) {
/* support nhwc */
char *src = (char *)src_ptr;
char *dst = (char *)dst_ptr;
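/* work on raw bytes so one pack routine serves both fp32 (data_size = sizeof(float)) and int8 (data_size = sizeof(int8_t)) */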
for (int dst_h = 0; dst_h < conv_param->output_h_; dst_h++) {
int src_h = dst_h * conv_param->stride_h_ - conv_param->pad_h_;
if (src_h < 0 || src_h >= conv_param->input_h_) {
continue;
}
const float *src_h_ptr = src + src_h * conv_param->input_w_ * conv_param->input_channel_;
float *dst_h_ptr = dst + dst_h * conv_param->output_w_ * conv_param->input_channel_;
const char *src_h_ptr = src + src_h * conv_param->input_w_ * conv_param->input_channel_ * data_size;
char *dst_h_ptr = dst + dst_h * conv_param->output_w_ * conv_param->input_channel_ * data_size;
for (int dst_w = 0; dst_w < conv_param->output_w_; dst_w++) {
int src_w = dst_w * conv_param->stride_w_ - conv_param->pad_w_;
if (src_w < 0 || src_w >= conv_param->input_w_) {
continue;
}
memcpy(dst_h_ptr + dst_w * conv_param->input_channel_, src_h_ptr + src_w * conv_param->input_channel_,
conv_param->input_channel_ * sizeof(float));
memcpy(dst_h_ptr + dst_w * conv_param->input_channel_ * data_size,
src_h_ptr + src_w * conv_param->input_channel_ * data_size, conv_param->input_channel_ * data_size);
}
}
return;
......@@ -188,6 +190,105 @@ void Pack1x1WeightFp32(const float *weight_data, float *packed_weight, ConvParam
return;
}
void PackInputSum16x4PerLater(const int8_t *src, int32_t *dst, int32_t filter_zp, size_t row4, size_t col16) {
/* optimize normal -> same layout */
#ifdef ENABLE_ARM64
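/* per 4-row block: load 4 rows x 16 cols of int8, widen with pairwise adds down to one int32 sum per row,
 * multiply the 4 sums by filter_zp and store them; the trailing "beq 1b" reuses the flags from the
 * "cmp x2, %[col16]" that terminated the inner loop, so it always branches back to label 1 */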
asm volatile(
"mov x10, %[src] \n"
"mov x11, %[dst] \n"
"dup v15.4s, %w[filter_zp] \n"
"mov x0, #0 \n"
"1: \n"
"cmp x0, %[row4] \n"
"beq 4f \n"
"add x0, x0, #4\n"
"dup v10.4s, wzr \n"
"mov x2, #0 \n"
"2: \n"
"cmp x2, %[col16] \n"
"beq 3f \n"
"add x2, x2, #16\n"
"ld1 {v0.16b}, [x10], #16\n"
"ld1 {v1.16b}, [x10], #16\n"
"ld1 {v2.16b}, [x10], #16\n"
"ld1 {v3.16b}, [x10], #16\n"
"saddlp v4.8h, v0.16b \n"
"saddlp v5.8h, v1.16b \n"
"saddlp v6.8h, v2.16b \n"
"saddlp v7.8h, v3.16b \n"
"saddlp v0.4S, v4.8h \n"
"saddlp v1.4S, v5.8h \n"
"saddlp v2.4S, v6.8h \n"
"saddlp v3.4S, v7.8h \n"
"addv s4, v0.4S \n"
"addv s5, v1.4S \n"
"addv s6, v2.4S \n"
"addv s7, v3.4S \n"
"mov v0.s[0], v4.s[0] \n"
"mov v0.s[1], v5.s[0] \n"
"mov v0.s[2], v6.s[0] \n"
"mov v0.s[3], v7.s[0] \n"
"add v10.4s, v10.4s, v0.4s \n"
"b 2b\n"
"3: \n"
"mul v10.4s, v10.4s, v15.4s \n"
"st1 {v10.4s}, [x11], #16 \n"
"beq 1b \n"
"4: \n"
:
: [ dst ] "r"(dst), [ src ] "r"(src), [ row4 ] "r"(row4), [ col16 ] "r"(col16), [ filter_zp ] "r"(filter_zp)
: "x0", "x1", "x2", "x3", "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v10", "v15");
#else
for (int r = 0; r < row4; r++) {
int32_t tmp_value = 0;
for (int c = 0; c < col16; c++) {
int r4div = r / C4NUM, r4mod = r % C4NUM, c16div = c / C16NUM, c16mod = c % C16NUM;
int src_index = r4div * C4NUM * col16 + c16div * C16NUM * C4NUM + r4mod * C16NUM + c16mod;
tmp_value += src[src_index];
}
dst[r] = tmp_value * filter_zp;
}
#endif
return;
}
void PackInputSum16x4Int8(int8_t *input_value, int32_t *input_sum, size_t input_channel, size_t output_channel,
size_t plane_size, ConvParameter *conv_param) {
size_t hw4 = UP_ROUND(plane_size, C4NUM);
size_t ic16 = UP_ROUND(input_channel, C16NUM);
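/* per-tensor filter zp: one sum per input row; per-channel: one zp-scaled sum per (row, output channel), stored in oc4 layout */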
if (conv_param->conv_quant_arg_.filter_arg_num_ == 1) {
PackInputSum16x4PerLater(input_value, input_sum, conv_param->conv_quant_arg_.filter_quant_args_[0].zp_, hw4, ic16);
} else {
for (int ri = 0; ri < plane_size; ri++) {
int ri4div = ri / C4NUM, ri4mod = ri % C4NUM;
for (int ci = 0; ci < output_channel; ci++) {
int32_t tmp_sum_value = 0;
int ci4div = ci / C4NUM, ci4mod = ci % C4NUM;
int32_t filter_zp = conv_param->conv_quant_arg_.filter_quant_args_[ci].zp_;
for (int di = 0; di < input_channel; di++) {
size_t di16div = di / C16NUM, di16mod = di % C16NUM;
int src_index = ri4div * C4NUM * ic16 + di16div * C16NUM * C4NUM + ri4mod * C16NUM + di16mod;
tmp_sum_value += input_value[src_index];
}
int dst_index = ci4div * C4NUM * hw4 + ri * C4NUM + ci4mod;
input_sum[dst_index] = tmp_sum_value * filter_zp;
}
}
}
return;
}
void Im2ColPackUnitFp32(const float *input_data, ConvParameter *conv_param, float *packed_input, int real_cal_num,
int block_index) {
// input format : nhwc
......
......@@ -35,10 +35,15 @@ void Im2ColPackUnitInt8(const int8_t *input_data, int8_t *packed_input, int real
void Im2ColPackUnitInt8Opt(const int8_t *input_data, int8_t *packed_input, int real_cal_num, int block_index,
int32_t *input_sum, ConvParameter *conv_param);
void Conv1x1InputPackFp32(const float *src, float *dst, ConvParameter *conv_param);
void PackInputSum16x4PerLater(const int8_t *src, int32_t *dst, int32_t filter_zp, size_t row4, size_t col16);
void Conv1x1InputPack(const void *src_ptr, void *dst_ptr, ConvParameter *conv_param, int data_size);
void Pack1x1WeightFp32(const float *weight_data, float *packed_weight, ConvParameter *conv_param);
void PackInputSum16x4Int8(int8_t *input_value, int32_t *input_sum, size_t input_channel, size_t output_channel,
size_t plane_size, ConvParameter *conv_param);
void MatrixPack(const float *src, float *dst, int row, int ic4, int stride);
void PackInputToC8Int8(const int8_t *input_data, int16_t *packed_input, ConvParameter *conv_param);
......
......@@ -118,10 +118,13 @@ int ConvolutionBaseCPUKernel::CheckLayout(lite::tensor::Tensor *input_tensor) {
}
int ConvolutionBaseCPUKernel::SetIfPerChannel() {
auto filter_tensor = in_tensors_.at(kWeightIndex);
auto input_channel = filter_tensor->Channel();
auto output_channel = filter_tensor->Batch();
uint8_t per_channel = 0b0;
if (conv_quant_arg_->input_arg_num_ != kPerTensor) {
int in_channel = conv_param_->input_channel_;
if (static_cast<int>(conv_quant_arg_->input_arg_num_) != in_channel) {
if (static_cast<int>(conv_quant_arg_->input_arg_num_) != input_channel) {
MS_LOG(ERROR) << "input per channel quant param length is not equal to input channel.";
return RET_ERROR;
}
......@@ -129,8 +132,7 @@ int ConvolutionBaseCPUKernel::SetIfPerChannel() {
}
if (conv_quant_arg_->filter_arg_num_ != kPerTensor) {
int filter_num = conv_param_->output_channel_;
if (static_cast<int>(conv_quant_arg_->filter_arg_num_) != filter_num) {
if (static_cast<int>(conv_quant_arg_->filter_arg_num_) != output_channel) {
MS_LOG(ERROR) << "weight per channel quant param length is not equal to filter num.";
return RET_ERROR;
}
......@@ -138,8 +140,7 @@ int ConvolutionBaseCPUKernel::SetIfPerChannel() {
}
if (conv_quant_arg_->output_arg_num_ != kPerTensor) {
int out_channel = conv_param_->output_channel_;
if (static_cast<int>(conv_quant_arg_->output_arg_num_) != out_channel) {
if (static_cast<int>(conv_quant_arg_->output_arg_num_) != output_channel) {
MS_LOG(ERROR) << "output per channel quant param length is not equal to output channel.";
return RET_ERROR;
}
......
......@@ -113,7 +113,7 @@ void Convolution1x1CPUKernel::Pre1x1Trans(float *src_input, float *src_output) {
output_ptr_ = src_output;
if (pre_trans_input_) {
Conv1x1InputPackFp32(src_input, input_ptr_, conv_param_);
Conv1x1InputPack(src_input, input_ptr_, conv_param_, sizeof(float));
} else {
input_ptr_ = src_input;
}
......
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/kernel/arm/int8/convolution_1x1_int8.h"
#include "src/runtime/runtime_api.h"
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_MEMORY_FAILED;
using mindspore::lite::RET_OK;
namespace mindspore::kernel {
Convolution1x1Int8CPUKernel::~Convolution1x1Int8CPUKernel() {
if (matmul_param_ != nullptr) {
delete matmul_param_;
matmul_param_ = nullptr;
}
if (packed_weight_ != nullptr) {
free(packed_weight_);
packed_weight_ = nullptr;
}
FreeResizeBuf();
FreeQuantParam();
}
void Convolution1x1Int8CPUKernel::FreeResizeBuf() {
if (packed_input_ != nullptr) {
free(packed_input_);
packed_input_ = nullptr;
}
if (input_sum_ != nullptr) {
free(input_sum_);
input_sum_ = nullptr;
}
return;
}
void Convolution1x1Int8CPUKernel::CheckSupportOptimize() {
support_optimize_ = false;
matmul_func_ = MatMulInt8_16x4_r;
#ifdef ENABLE_ARM64
void *optimize_op_handler = OptimizeModule::GetInstance()->optimized_op_handler_;
if (optimize_op_handler != nullptr) {
dlerror();
*(reinterpret_cast<void **>(&matmul_func_)) = dlsym(optimize_op_handler, "MatMulRInt8_optimize_handler");
auto dlopen_error = dlerror();
if (dlopen_error != nullptr) {
MS_LOG(ERROR) << "load matmul func failed! " << dlopen_error << ".";
support_optimize_ = false;
matmul_func_ = nullptr;
} else {
support_optimize_ = true;
}
} else {
support_optimize_ = false;
matmul_func_ = nullptr;
}
#endif
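/* the loadable optimized handler is still an empty stub in this commit, so always use the C reference matmul for now */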
matmul_func_ = MatMulInt8_16x4_r;
return;
}
int Convolution1x1Int8CPUKernel::InitWeightBias() {
auto filter_tensor = in_tensors_.at(kWeightIndex);
auto input_channel = filter_tensor->Channel();
auto output_channel = filter_tensor->Batch();
/* weight */
size_t size = UP_ROUND(input_channel, C16NUM) * UP_ROUND(output_channel, C4NUM) * sizeof(int8_t);
packed_weight_ = reinterpret_cast<int8_t *>(malloc(size));
if (packed_weight_ == nullptr) {
MS_LOG(ERROR) << "Conv1x1 int8 Malloc weight error!";
return RET_ERROR;
}
memset(packed_weight_, 0, size);
RowMajor2Row4x16MajorInt8(reinterpret_cast<int8_t *>(filter_tensor->Data()), packed_weight_, output_channel,
input_channel);
/* bias = bias + deep * input_zp * filter_zp - input_zp * weight_sum */
int col4 = UP_ROUND(output_channel, C4NUM);
bias_data_ = malloc(col4 * sizeof(int32_t));
if (bias_data_ == nullptr) {
MS_LOG(ERROR) << "Conv1x1 int8 Malloc bias_ptr_ error!";
return RET_ERROR;
}
memset(bias_data_, 0, col4 * sizeof(int32_t));
if (in_tensors_.size() == 3) {
memcpy(bias_data_, in_tensors_[kBiasIndex]->Data(), output_channel * sizeof(int32_t));
}
int32_t *bias_data = reinterpret_cast<int32_t *>(bias_data_);
int8_t *weight = reinterpret_cast<int8_t *>(filter_tensor->Data());
int32_t input_zp = conv_param_->conv_quant_arg_.input_quant_args_[0].zp_;
for (int oc = 0; oc < output_channel; oc++) {
int32_t weight_sum_value = 0;
int32_t filter_zp = (conv_param_->conv_quant_arg_.filter_arg_num_ == 1)
? conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_
: conv_param_->conv_quant_arg_.filter_quant_args_[oc].zp_;
for (int ic = 0; ic < input_channel; ic++) {
weight_sum_value += weight[oc * input_channel + ic];
}
bias_data[oc] += filter_zp * input_zp * input_channel - weight_sum_value * input_zp;
}
return RET_OK;
}
int Convolution1x1Int8CPUKernel::Init() {
if (!InferShapeDone()) {
return RET_OK;
}
matmul_param_ = new (std::nothrow) MatMulParameter();
if (matmul_param_ == nullptr) {
MS_LOG(ERROR) << "Init matmul_param_ failed.";
return RET_ERROR;
}
CheckSupportOptimize();
auto ret = SetQuantParam();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Set quant param failed.";
return ret;
}
ret = InitWeightBias();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Init weight bias failed.";
return ret;
}
return ReSize();
}
int Convolution1x1Int8CPUKernel::InitParam() {
pre_trans_input_ = (conv_param_->pad_h_ != 0 || conv_param_->pad_w_ != 0 || conv_param_->stride_h_ != 1 ||
conv_param_->stride_w_ != 1);
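/* a padded or strided 1x1 conv does not read the input densely, so the input must be repacked first */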
matmul_param_->row_ = conv_param_->output_h_ * conv_param_->output_w_;
matmul_param_->deep_ = conv_param_->input_channel_;
matmul_param_->col_ = conv_param_->output_channel_;
thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->col_, C4NUM));
thread_stride_ = UP_DIV(UP_DIV(matmul_param_->col_, C4NUM), thread_count_);
size_t size = UP_ROUND(matmul_param_->row_, C4NUM) * UP_ROUND(matmul_param_->deep_, C16NUM);
packed_input_ = reinterpret_cast<int8_t *>(malloc(size * sizeof(int8_t)));
if (packed_input_ == nullptr) {
MS_LOG(ERROR) << "conv1x1 int8 Malloc packed_input_ error!";
return RET_ERROR;
}
memset(packed_input_, 0, size * sizeof(int8_t));
if (conv_quant_arg_->per_channel_ & FILTER_PER_CHANNEL) {
size = UP_ROUND(conv_param_->output_channel_, C4NUM) * UP_ROUND(matmul_param_->row_, C4NUM);
} else {
size = UP_ROUND(matmul_param_->row_, C4NUM);
}
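/* per-channel quantization keeps one input sum per (row, oc4) pair; per-tensor needs only one per row */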
input_sum_ = reinterpret_cast<int32_t *>(malloc(size * sizeof(int32_t)));
if (input_sum_ == nullptr) {
MS_LOG(ERROR) << "malloc input_sum_ failed.";
return RET_ERROR;
}
memset(input_sum_, 0, size * sizeof(int32_t));
return RET_OK;
}
int Convolution1x1Int8CPUKernel::ReSize() {
FreeResizeBuf();
ConvolutionBaseCPUKernel::Init();
int error_code = InitParam();
if (error_code != RET_OK) {
MS_LOG(ERROR) << "Convolution base init failed.";
return error_code;
}
return RET_OK;
}
void Convolution1x1Int8CPUKernel::Pre1x1Trans(int8_t *src_input, int8_t *src_output) {
output_ptr_ = src_output;
if (pre_trans_input_) {
Conv1x1InputPack(src_input, input_ptr_, conv_param_, sizeof(int8_t));
} else {
input_ptr_ = src_input;
}
RowMajor2Row16x4MajorInt8(input_ptr_, packed_input_, matmul_param_->row_, matmul_param_->deep_);
return;
}
int Convolution1x1Int8CPUKernel::RunImpl(int task_id) {
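/* each task handles a contiguous block of thread_stride_ * C4NUM output channels */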
int cur_oc = MSMIN(thread_stride_ * C4NUM, matmul_param_->col_ - task_id * thread_stride_ * C4NUM);
if (cur_oc <= 0) {
return RET_OK;
}
int32_t *bias = reinterpret_cast<int32_t *>(bias_data_);
Conv1x1Int8(packed_input_, packed_weight_ + task_id * thread_stride_ * C4NUM * UP_ROUND(matmul_param_->deep_, C16NUM),
output_ptr_ + task_id * thread_stride_ * C4NUM, input_sum_, bias + task_id * thread_stride_ * C4NUM,
matmul_param_->row_, cur_oc, UP_ROUND(matmul_param_->deep_, C16NUM), conv_param_, matmul_func_);
return RET_OK;
}
int Convolution1x1Int8Impl(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
auto conv = reinterpret_cast<Convolution1x1Int8CPUKernel *>(cdata);
auto error_code = conv->RunImpl(task_id);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "conv1x1 Int8 Run error task_id[" << task_id << "] error_code[" << error_code << "]";
return RET_ERROR;
}
return RET_OK;
}
int Convolution1x1Int8CPUKernel::Run() {
auto ret = Prepare();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Prepare failed.";
return RET_ERROR;
}
if (pre_trans_input_) {
input_ptr_ =
reinterpret_cast<int8_t *>(ctx_->allocator->Malloc(matmul_param_->row_ * matmul_param_->deep_ * sizeof(int8_t)));
if (input_ptr_ == nullptr) {
MS_LOG(ERROR) << "Conv1x1 int8 Malloc input_ptr_ error!";
return RET_MEMORY_FAILED;
}
}
int8_t *src_in = reinterpret_cast<int8_t *>(in_tensors_[0]->Data());
int8_t *src_out = reinterpret_cast<int8_t *>(out_tensors_[0]->Data());
for (int batch_index = 0; batch_index < conv_param_->input_batch_; batch_index++) {
Pre1x1Trans(src_in + batch_index * conv_param_->input_h_ * conv_param_->input_w_ * conv_param_->input_channel_,
src_out + batch_index * matmul_param_->row_ * matmul_param_->col_);
PackInputSum16x4Int8(packed_input_, input_sum_, matmul_param_->deep_, matmul_param_->col_, matmul_param_->row_,
conv_param_);
int error_code = LiteBackendParallelLaunch(Convolution1x1Int8Impl, this, thread_count_);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "conv1x1 fp16 error error_code[" << error_code << "]";
return RET_ERROR;
}
}
if (pre_trans_input_ && input_ptr_ != nullptr) {
ctx_->allocator->Free(input_ptr_);
input_ptr_ = nullptr;
}
return RET_OK;
}
} // namespace mindspore::kernel
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_CONVOLUTION_1x1_INT8_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_CONVOLUTION_1x1_INT8_H_
#include <vector>
#include "src/lite_kernel.h"
#include "include/errorcode.h"
#include "schema/model_generated.h"
#include "src/runtime/kernel/arm/base/convolution_base.h"
#include "nnacl/int8/conv_int8.h"
#include "nnacl/int8/matmul_int8.h"
#include "nnacl/matmul_parameter.h"
#include "nnacl/optimized_kernel.h"
namespace mindspore::kernel {
class Convolution1x1Int8CPUKernel : public ConvolutionBaseCPUKernel {
public:
Convolution1x1Int8CPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx,
const mindspore::lite::PrimitiveC *primitive)
: ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
~Convolution1x1Int8CPUKernel() override;
int Init() override;
int ReSize() override;
int Run() override;
public:
int RunImpl(int task_id);
private:
void FreeResizeBuf();
int InitParam();
int InitWeightBias();
void Pre1x1Trans(int8_t *src_input, int8_t *src_output);
void CheckSupportOptimize();
private:
int32_t *input_sum_ = nullptr; /* per-channel: oc4 format */
int8_t *packed_weight_ = nullptr;
int8_t *packed_input_ = nullptr;
int8_t *input_ptr_ = nullptr;
int8_t *output_ptr_ = nullptr;
size_t thread_count_ = 1;
size_t thread_stride_ = 0;
bool pre_trans_input_ = false;
MatMulParameter *matmul_param_ = nullptr;
MATMUL_OPT_R_FUNC matmul_func_ = nullptr;
bool support_optimize_ = false;
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_CONVOLUTION_1x1_INT8_H_
......@@ -16,6 +16,7 @@
#include "src/runtime/kernel/arm/int8/convolution_int8.h"
#include "src/runtime/kernel/arm/int8/convolution_3x3_int8.h"
#include "src/runtime/kernel/arm/int8/convolution_1x1_int8.h"
#include "nnacl/int8/conv_int8.h"
#include "src/runtime/kernel/arm/base/layout_transform.h"
#include "schema/model_generated.h"
......@@ -400,6 +401,9 @@ kernel::LiteKernel *CpuConvInt8KernelCreator(const std::vector<lite::tensor::Ten
kernel::LiteKernel *kernel;
if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) {
kernel = new (std::nothrow) kernel::Convolution3x3Int8CPUKernel(opParameter, inputs, outputs, ctx, primitive);
} else if (kernel_h == 1 && kernel_w == 1) {
/* Convolution1x1Int8CPUKernel is not enabled yet; use the general int8 conv kernel for 1x1 as well */
kernel = new (std::nothrow) kernel::ConvolutionInt8CPUKernel(opParameter, inputs, outputs, ctx, primitive);
} else {
kernel = new (std::nothrow) kernel::ConvolutionInt8CPUKernel(opParameter, inputs, outputs, ctx, primitive);
}
......
......@@ -54,7 +54,7 @@ TEST_F(TestConv1x1Fp32, Input1x1PrePack1) {
conv_param->pad_h_ = conv_param->pad_w_ = 2;
float out[20] = {0};
Conv1x1InputPackFp32(in, out, conv_param);
Conv1x1InputPack(in, out, conv_param, sizeof(float));
EXPECT_EQ(0, lite::CompareOutputData(out, correct, 20));
delete conv_param;
}
......@@ -95,7 +95,7 @@ TEST_F(TestConv1x1Fp32, Input1x1PrePack2) {
conv_param->pad_h_ = conv_param->pad_w_ = 0;
float out[28] = {0};
Conv1x1InputPackFp32(in, out, conv_param);
Conv1x1InputPack(in, out, conv_param, sizeof(float));
CompareOutputData(out, correct, 28, 0.0001);
delete conv_param;
}
......@@ -114,7 +114,7 @@ TEST_F(TestConv1x1Fp32, Input1x1PrePack3) {
float correct[] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 17.025112,
-5.052577, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
Conv1x1InputPackFp32(in, out, conv_param);
Conv1x1InputPack(in, out, conv_param, sizeof(float));
EXPECT_EQ(0, lite::CompareOutputData(out, correct, 18));
delete conv_param;
}
......@@ -136,7 +136,7 @@ TEST_F(TestConv1x1Fp32, Input1x1PrePack4) {
-1.770, 41.903, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
float out[54] = {0};
Conv1x1InputPackFp32(in, out, conv_param);
Conv1x1InputPack(in, out, conv_param, sizeof(float));
EXPECT_EQ(0, lite::CompareOutputData(out, correct, 54));
delete conv_param;
}
......
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "utils/log_adapter.h"
#include "common/common_test.h"
#include "mindspore/lite/src/lite_kernel.h"
#include "src/common/file_utils.h"
#include "nnacl/quantization/quantize.h"
#include "nnacl/common_func.h"
#include "mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.h"
namespace mindspore {
using lite::tensor::Tensor;
class TestConv1x1Int8 : public mindspore::CommonTest {
public:
TestConv1x1Int8() {}
};
TEST_F(TestConv1x1Int8, Input1x1PrePack1) {
auto conv_param = new ConvParameter();
conv_param->input_channel_ = 6;
conv_param->input_h_ = conv_param->input_w_ = 3;
conv_param->output_h_ = conv_param->output_w_ = 3;
conv_param->stride_h_ = conv_param->stride_w_ = 2;
conv_param->pad_h_ = conv_param->pad_w_ = 1;
int8_t in[] = {4, 13, -3, 16, 19, 8, 19, -6, -2, -9, 9, 18, 23, 8, 47, -14, 15, 4,
-0, 37, -0, 6, 0, -1, 37, 13, 11, 1, -1, 41, 9, 14, 3, 0, 8, 9,
14, -14, -8, -8, -8, 7, 19, 17, 13, 3, 9, 18, -1, -0, 18, 0, 4, -2};
int8_t correct[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 37, 13, 11,
1, -1, 41, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int8_t out[54] = {0};
Conv1x1InputPack(in, out, conv_param, sizeof(int8_t));
CompareOutputData(out, correct, 54, 0);
delete conv_param;
}
TEST_F(TestConv1x1Int8, Input1x1PrePack2) {
auto conv_param = new ConvParameter();
int8_t in[] = {-0, -0, -7, -0, -6, 4, 9, 9, 12, -0, 6, 2, 13, 15, 16, -7, 9, 1, 10, 13, 17, 17, 4, 13,
-6, 5, 7, -7, 15, 0, 1, -5, -7, 18, 15, 19, -7, 13, 7, -0, 16, -5, 16, -7, 6, 10, -5, 10,
9, 12, -9, -8, -4, 18, -5, 0, 7, 12, 13, 16, -9, -4, 18, -0, 8, 6, 2, 10, 16, 1, -1, 2,
9, 8, 9, 13, 7, -0, 15, -7, 0, -0, 17, 19, 9, 17, -6, -2, 7, -0, 10, -6, -6, 18, -0, 9,
9, 6, 3, -1, -8, 10, 17, -9, 17, 6, -3, 7, -2, -0, -9, 1, -3, 15, 13, 4, 18};
int8_t correct[] = {0, 0, 0, 0, 0, 0, 15, -7, -7, 0, 0, 0, 9, 7, 0, 0, 0, 0, 0, 0};
conv_param->input_h_ = 9;
conv_param->input_w_ = 13;
conv_param->input_channel_ = 1;
conv_param->output_h_ = 4;
conv_param->output_w_ = 5;
conv_param->stride_h_ = conv_param->stride_w_ = 4;
conv_param->pad_h_ = conv_param->pad_w_ = 2;
int8_t out[20] = {0};
Conv1x1InputPack(in, out, conv_param, sizeof(int8_t));
CompareOutputData(out, correct, 20, 0);
delete conv_param;
}
int Conv1x1Int8TestInit1_perchannel(std::vector<lite::tensor::Tensor *> *inputs_,
std::vector<lite::tensor::Tensor *> *outputs_, ConvParameter *conv_param,
int8_t **correct) {
Tensor *in_t = new Tensor(kNumberTypeInt8, {1, 2, 3, 4}, schema::Format_NHWC, static_cast<schema::NodeType>(1));
auto in_quant_arg = new mindspore::lite::tensor::QuantArg();
in_quant_arg->zeroPoint = -42, in_quant_arg->scale = 0.117647;
in_t->AddQuantParam(*in_quant_arg);
in_t->MallocData();
int8_t in[] = {62, -14, 88, 2, -35, 43, 83, -111, 75, 26, 14, -121,
-78, 56, 37, -31, 15, -75, -10, -115, -71, 74, -65, -15};
memcpy(in_t->Data(), in, in_t->ElementsNum() * sizeof(int8_t));
inputs_->push_back(in_t);
Tensor *weight_t = new Tensor(kNumberTypeInt8, {3, 1, 1, 4}, schema::Format_NHWC, static_cast<schema::NodeType>(1));
weight_t->MallocData();
auto weight_quant_arg1 = new mindspore::lite::tensor::QuantArg();
weight_quant_arg1->zeroPoint = 66, weight_quant_arg1->scale = 0.96439215686275;
auto weight_quant_arg2 = new mindspore::lite::tensor::QuantArg();
weight_quant_arg2->zeroPoint = 33, weight_quant_arg2->scale = 0.76439215686275;
auto weight_quant_arg3 = new mindspore::lite::tensor::QuantArg();
weight_quant_arg3->zeroPoint = -20, weight_quant_arg3->scale = 0.99117647;
weight_t->AddQuantParam(*weight_quant_arg1);
weight_t->AddQuantParam(*weight_quant_arg2);
weight_t->AddQuantParam(*weight_quant_arg3);
int8_t weight[] = {65, 67, 65, 65, 32, 33, 34, 33, -19, -20, -19, -20};
memcpy(weight_t->Data(), weight, weight_t->ElementsNum() * sizeof(int8_t));
inputs_->push_back(weight_t);
Tensor *out_t = new Tensor(kNumberTypeInt8, {1, 2, 3, 3}, schema::Format_NHWC, static_cast<schema::NodeType>(1));
out_t->MallocData();
auto output_quant_arg = new mindspore::lite::tensor::QuantArg();
output_quant_arg->zeroPoint = 7, output_quant_arg->scale = 0.294321233;
out_t->AddQuantParam(*output_quant_arg);
outputs_->push_back(out_t);
*correct = reinterpret_cast<int8_t *>(malloc(out_t->ElementsNum() * sizeof(int8_t)));
int8_t nchw_co[] = {-83, 34, 100, 10, 113, 55, 3, 16, 63, 6, 93, 20, 5, 6, 42, 35, 28, -24};
memcpy(*correct, nchw_co, out_t->ElementsNum() * sizeof(int8_t));
conv_param->kernel_h_ = conv_param->kernel_w_ = 1;
conv_param->stride_h_ = conv_param->stride_w_ = 1;
conv_param->dilation_h_ = conv_param->dilation_w_ = 1;
conv_param->pad_h_ = conv_param->pad_w_ = 0;
conv_param->is_relu_ = conv_param->is_relu6_ = false;
return out_t->ElementsNum();
}
TEST_F(TestConv1x1Int8, Conv1x1TestPerChannel) {
std::vector<lite::tensor::Tensor *> inputs_;
std::vector<lite::tensor::Tensor *> outputs_;
auto conv_param = new ConvParameter();
int8_t *correct;
auto ctx = new lite::Context;
ctx->thread_num_ = 1;
int total_size = Conv1x1Int8TestInit1_perchannel(&inputs_, &outputs_, conv_param, &correct);
kernel::Convolution1x1Int8CPUKernel *conv1x1 = new kernel::Convolution1x1Int8CPUKernel(
reinterpret_cast<OpParameter *>(conv_param), inputs_, outputs_, ctx, nullptr);
conv1x1->Init();
conv1x1->Run();
CompareOutputData(reinterpret_cast<int8_t *>(outputs_[0]->Data()), correct, total_size, 70);
delete conv1x1;
for (auto t : inputs_) delete t;
for (auto t : outputs_) delete t;
free(correct);
}
int Conv1x1Int8TestInit1(std::vector<lite::tensor::Tensor *> *inputs_, std::vector<lite::tensor::Tensor *> *outputs_,
ConvParameter *conv_param, int8_t **correct) {
Tensor *in_t = new Tensor(kNumberTypeInt8, {1, 2, 3, 4}, schema::Format_NHWC, static_cast<schema::NodeType>(1));
auto in_quant_arg = new mindspore::lite::tensor::QuantArg();
in_quant_arg->zeroPoint = -42, in_quant_arg->scale = 0.117647;
in_t->AddQuantParam(*in_quant_arg);
in_t->MallocData();
float in[] = {12.216284, 3.3466918, 15.327419, 5.234958, 0.804376, 9.952188, 14.727955, -8.080715,
13.71383, 8.055829, 6.5845337, -9.25232, -4.24519, 11.550042, 9.262012, 1.2780352,
6.7263746, -3.9301445, 3.764492, -8.602078, -3.3558068, 13.619035, -2.6694393, 3.2008505};
Quantize(in, in_t->ElementsNum(), in_quant_arg->scale, in_quant_arg->zeroPoint,
reinterpret_cast<int8_t *>(in_t->Data()));
inputs_->push_back(in_t);
Tensor *weight_t = new Tensor(kNumberTypeInt8, {3, 1, 1, 4}, schema::Format_NHWC, static_cast<schema::NodeType>(1));
auto weight_quant_arg = new mindspore::lite::tensor::QuantArg();
weight_quant_arg->zeroPoint = 66, weight_quant_arg->scale = 0.036439215686275;
weight_t->AddQuantParam(*weight_quant_arg);
weight_t->MallocData();
float weight[] = {-0.7308652, 0.5257509, -0.87825793, -1.123181, -1.2206168, 0.562695,
1.5382664, -0.5020635, 0.8591602, -0.26410004, 1.1262615, 0.073132955};
Quantize(weight, weight_t->ElementsNum(), weight_quant_arg->scale, weight_quant_arg->zeroPoint,
reinterpret_cast<int8_t *>(weight_t->Data()));
inputs_->push_back(weight_t);
Tensor *out_t = new Tensor(kNumberTypeInt8, {1, 2, 3, 3}, schema::Format_NHWC, static_cast<schema::NodeType>(1));
out_t->MallocData();
auto output_quant_arg = new mindspore::lite::tensor::QuantArg();
output_quant_arg->zeroPoint = 7, output_quant_arg->scale = 0.234321233;
out_t->AddQuantParam(*output_quant_arg);
outputs_->push_back(out_t);
*correct = reinterpret_cast<int8_t *>(malloc(out_t->ElementsNum() * sizeof(int8_t)));
float nchw_co[] = {-26.51016327, 7.92113757, 27.25741343, 0.785643655, 31.3307619, 14.05927672,
-1.178490666, 2.5676252, 16.39408946, -0.394793726, 25.2866881, 3.827249175,
-0.626854507, -0.3122176, 10.42769169, 8.362184085, 6.04617807, -9.252362384};
Quantize(nchw_co, out_t->ElementsNum(), output_quant_arg->scale, output_quant_arg->zeroPoint, *correct);
conv_param->kernel_h_ = conv_param->kernel_w_ = 1;
conv_param->stride_h_ = conv_param->stride_w_ = 1;
conv_param->dilation_h_ = conv_param->dilation_w_ = 1;
conv_param->pad_h_ = conv_param->pad_w_ = 0;
conv_param->is_relu_ = conv_param->is_relu6_ = false;
return out_t->ElementsNum();
}
TEST_F(TestConv1x1Int8, Conv1x1Int8Test1) {
std::vector<lite::tensor::Tensor *> inputs_;
std::vector<lite::tensor::Tensor *> outputs_;
auto conv_param = new ConvParameter();
int8_t *correct;
auto ctx = new lite::Context;
ctx->thread_num_ = 1;
int total_size = Conv1x1Int8TestInit1(&inputs_, &outputs_, conv_param, &correct);
kernel::Convolution1x1Int8CPUKernel *conv1x1 = new kernel::Convolution1x1Int8CPUKernel(
reinterpret_cast<OpParameter *>(conv_param), inputs_, outputs_, ctx, nullptr);
conv1x1->Init();
conv1x1->Run();
CompareOutputData(reinterpret_cast<int8_t *>(outputs_[0]->Data()), correct, total_size, 2);
delete conv1x1;
for (auto t : inputs_) delete t;
for (auto t : outputs_) delete t;
free(correct);
}
int Conv1x1Int8TestInit2(std::vector<lite::tensor::Tensor *> *inputs_, std::vector<lite::tensor::Tensor *> *outputs_,
ConvParameter *conv_param, int8_t **correct) {
size_t buffer_size;
Tensor *in_t = new Tensor(kNumberTypeInt8, {1, 2, 3, 4}, schema::Format_NHWC, static_cast<schema::NodeType>(1));
auto in_quant_arg = new mindspore::lite::tensor::QuantArg();
in_quant_arg->zeroPoint = -42, in_quant_arg->scale = 0.117647;
in_t->AddQuantParam(*in_quant_arg);
in_t->MallocData();
std::string input_path = "./input";
auto input = mindspore::lite::ReadFile(input_path.c_str(), &buffer_size);
memcpy(in_t->Data(), input, buffer_size);
inputs_->push_back(in_t);
delete[] input;
Tensor *weight_t = new Tensor(kNumberTypeInt8, {3, 1, 1, 4}, schema::Format_NHWC, static_cast<schema::NodeType>(1));
auto weight_quant_arg = new mindspore::lite::tensor::QuantArg();
weight_quant_arg->zeroPoint = 66, weight_quant_arg->scale = 0.036439215686275;
weight_t->AddQuantParam(*weight_quant_arg);
weight_t->MallocData();
std::string weight_path = "./weight";
auto weight = mindspore::lite::ReadFile(weight_path.c_str(), &buffer_size);
memcpy(weight_t->Data(), weight, buffer_size);
inputs_->push_back(weight_t);
delete[] weight;
Tensor *bias_t = new Tensor(kNumberTypeInt32, {4}, schema::Format_NHWC, static_cast<schema::NodeType>(1));
bias_t->MallocData();
std::string bias_path = "./bias";
auto bias = mindspore::lite::ReadFile(bias_path.c_str(), &buffer_size);
memcpy(bias_t->Data(), bias, buffer_size);
inputs_->push_back(bias_t);
delete[] bias;
Tensor *out_t = new Tensor(kNumberTypeInt8, {1, 2, 3, 3}, schema::Format_NHWC, static_cast<schema::NodeType>(1));
out_t->MallocData();
auto output_quant_arg = new mindspore::lite::tensor::QuantArg();
output_quant_arg->zeroPoint = 7, output_quant_arg->scale = 0.234321233;
out_t->AddQuantParam(*output_quant_arg);
outputs_->push_back(out_t);
*correct = reinterpret_cast<int8_t *>(malloc(out_t->ElementsNum() * sizeof(int8_t)));
std::string output_path = "./output";
auto output = mindspore::lite::ReadFile(output_path.c_str(), &buffer_size);
memcpy(*correct, output, buffer_size);
delete[] output;
conv_param->kernel_h_ = conv_param->kernel_w_ = 1;
conv_param->stride_h_ = conv_param->stride_w_ = 1;
conv_param->dilation_h_ = conv_param->dilation_w_ = 1;
conv_param->pad_h_ = conv_param->pad_w_ = 0;
conv_param->is_relu_ = conv_param->is_relu6_ = false;
return out_t->ElementsNum();
}
TEST_F(TestConv1x1Int8, Conv1x1Int8Test2) {
std::vector<lite::tensor::Tensor *> inputs_;
std::vector<lite::tensor::Tensor *> outputs_;
auto conv_param = new ConvParameter();
int8_t *correct;
auto ctx = new lite::Context;
ctx->thread_num_ = 1;
int total_size = Conv1x1Int8TestInit2(&inputs_, &outputs_, conv_param, &correct);
kernel::Convolution1x1Int8CPUKernel *conv1x1 = new kernel::Convolution1x1Int8CPUKernel(
reinterpret_cast<OpParameter *>(conv_param), inputs_, outputs_, ctx, nullptr);
conv1x1->Init();
conv1x1->Run();
CompareOutputData(reinterpret_cast<int8_t *>(outputs_[0]->Data()), correct, total_size, 2);
delete conv1x1;
for (auto t : inputs_) delete t;
for (auto t : outputs_) delete t;
free(correct);
}
} // namespace mindspore